# Modified from http://cs.stanford.edu/people/karpathy/cs231nfiles/minimal_net.html
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'spectral'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Generate the classic 3-class spiral toy dataset (N points per class in 2-D).
np.random.seed(0)
N = 500  # number of points per class
D = 2    # dimensionality
K = 3    # number of classes
X = np.zeros((N * K, D))
y = np.zeros(N * K, dtype='uint8')
# `range` instead of Py2-only `xrange`: works identically on Python 2 and 3.
for j in range(K):
    ix = range(N * j, N * (j + 1))
    r = np.linspace(0.05, 1, N)  # radius
    # each class sweeps a different angular band, with Gaussian angle noise
    t = np.linspace(j * (K + 1), (j + 1) * (K + 1), N) + np.random.randn(N) * 0.2  # theta
    X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y[ix] = j
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40,linewidths=0)#, cmap=plt.cm.Spectral)
plt.xlim([-r[-1]*1.1,r[-1]*1.1])
plt.ylim([-r[-1]*1.1,r[-1]*1.1])
#fig.savefig('spiral_raw.png')
# Train a linear (softmax) classifier on the spiral data.
# Fixes: Py2 `print` statements -> `print(...)` calls and `xrange` -> `range`,
# both of which run identically on Python 2 and are required on Python 3.
# initialize parameters randomly
W = 0.01 * np.random.randn(D, K)
b = np.zeros((1, K))
# some hyperparameters
step_size = 1e-0
reg = 1e-3  # regularization strength
# gradient descent loop
num_examples = X.shape[0]
for i in range(200):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b
    # softmax: exponentiate, then normalize each row into class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]
    # loss: average cross-entropy plus L2 regularization
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs) / num_examples
    reg_loss = 0.5 * reg * np.sum(W * W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))
    # gradient on scores: softmax cross-entropy gradient is (probs - one_hot(y)) / N
    dscores = probs
    dscores[range(num_examples), y] -= 1
    dscores /= num_examples
    # backpropagate the gradient to the parameters (W, b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
    dW += reg * W  # regularization gradient
    # vanilla gradient-descent parameter update
    W += -step_size * dW
    b += -step_size * db
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print('training accuracy: %.2f' % (np.mean(predicted_class == y)))
def sigmoid(x):
    """Elementwise logistic function 1 / (1 + e^-x)."""
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):
    """Elementwise hyperbolic tangent (thin wrapper around np.tanh)."""
    return np.tanh(x)
import numpy.random as npr
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_linear.png')
random_starts = []
for i in range(1000):
h = 30 # size of hidden layer
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
random_starts.append(mats_to_vec(W,b,W2,b2))
random_starts = np.array(random_starts)
pdist = distance.pdist(random_starts)
dist_mat = distance.squareform(pdist)
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between random parameter initializations')
plt.savefig('l2-pairwise-param-vec-distances-30h-random.jpg',dpi=600)
def train_net(h=30, D=D, K=K, num_iter=10000):
    """Train a one-hidden-layer ReLU network with softmax loss on the global
    (X, y) dataset via minibatch SGD, recording the parameters at every step.

    Parameters
    ----------
    h : int          -- number of hidden units
    D : int          -- input dimensionality (defaults to the global D)
    K : int          -- number of classes (defaults to the global K)
    num_iter : int   -- number of SGD updates

    Returns
    -------
    run_record : (num_iter, num_params) array holding the flattened
        parameter vector (W, b, W2, b2) after each update.

    Fixes vs. original: Py2-only `xrange`/`print` replaced with the
    Py2/Py3-compatible `range`/`print(...)`.
    NOTE(review): reads the globals X, y and npr; loss_record stays local
    and is discarded (only run_record is returned) — confirm if intended.
    """
    # initialize parameters randomly (small weights, zero biases)
    W = 0.01 * np.random.randn(D, h)
    b = np.zeros((1, h))
    W2 = 0.01 * np.random.randn(h, K)
    b2 = np.zeros((1, K))
    # some hyperparameters
    step_size = 1e-0
    reg = 1e-3  # regularization strength
    # keep a record of training progress
    params = np.hstack((W.ravel(), b.ravel(), W2.ravel(), b2.ravel()))
    num_params = len(params)
    run_record = np.zeros((num_iter, num_params))
    loss_record = np.zeros(num_iter)
    # stochastic gradient descent loop
    mini_batch_size = 100
    for i in range(num_iter):
        # minibatch: Bernoulli mask with *expected* size mini_batch_size
        if mini_batch_size < len(X):
            mask = npr.rand(len(X)) < (1.0 * mini_batch_size / len(X))
            X_ = X[mask]
            y_ = y[mask]
        else:
            X_ = X[:]
            y_ = y[:]
        num_examples = len(X)
        # forward pass on the FULL dataset, [N x K]
        # NOTE(review): this full-data pass is wasted work when
        # compute_full_loss is False and minibatching is active, since the
        # minibatch branch below recomputes everything — kept for fidelity.
        hidden_layer = np.maximum(0, np.dot(X, W) + b)  # ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
        # softmax class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]
        # optionally compute/record the full-data loss for monitoring
        compute_full_loss = False
        if mini_batch_size == len(X) or compute_full_loss:
            corect_logprobs = -np.log(probs[range(len(X)), y])
            data_loss = np.sum(corect_logprobs) / num_examples
            reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
            loss = data_loss + reg_loss
            if i % 1000 == 0:
                print("iteration %d: loss %f" % (i, loss))
            loss_record[i] = loss
        # gradient on scores from the full data (overwritten when minibatching)
        dscores = probs
        dscores[range(num_examples), y] -= 1
        dscores /= num_examples
        # compute minibatch loss/gradient actually used for the update:
        if mini_batch_size < len(X):
            num_examples = X_.shape[0]
            hidden_layer = np.maximum(0, np.dot(X_, W) + b)  # ReLU activation
            scores = np.dot(hidden_layer, W2) + b2
            exp_scores = np.exp(scores)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            corect_logprobs = -np.log(probs[range(num_examples), y_])
            data_loss = np.sum(corect_logprobs) / num_examples
            reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
            loss = data_loss + reg_loss
            dscores = probs
            dscores[range(num_examples), y_] -= 1
            dscores /= num_examples
        # backpropagate: first into W2 and b2 ...
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        # ... then into the hidden layer, gated by the ReLU ...
        dhidden = np.dot(dscores, W2.T)
        dhidden[hidden_layer <= 0] = 0
        # ... finally into W and b
        dW = np.dot(X_.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
        # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
        # parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        run_record[i] = np.hstack((W.ravel(), b.ravel(), W2.ravel(), b2.ravel()))
    return run_record
run_record1 = train_net()
run_records = []
# how far away are the endpoints of multiple runs?
from time import time
t = time()
for i in range(80):
run_records.append(train_net())
print(i)
print(time() - t)
start_points = np.array([run_record[0] for run_record in run_records])
pdist = distance.pdist(start_points)
max(pdist),min(pdist)
def pairwise_distance_over_time(run_records):
    """Pairwise distances between training runs at each recorded iteration.

    Parameters
    ----------
    run_records : sequence of (num_iter, num_params) arrays, one per run.

    Returns
    -------
    distances : (num_iter, num_pairs) array; row i is the condensed
        (scipy.spatial.distance.pdist) Euclidean distance vector between the
        runs' parameter vectors at iteration i.

    Improvement: the original computed a throwaway pdist of the iteration-0
    points just to learn the condensed vector length; that length is simply
    num_runs * (num_runs - 1) / 2.
    """
    n = len(run_records[0])  # number of recorded iterations
    num_runs = len(run_records)
    num_dist = num_runs * (num_runs - 1) // 2  # condensed pdist length
    distances = np.zeros((n, num_dist))
    for i in range(n):
        current_points = np.array([run_record[i] for run_record in run_records])
        distances[i] = distance.pdist(current_points)
    return distances
def plot_pairwise(run_records):
    """Plot the mean pairwise distance between runs over time, overlaid with a
    scatter of every individual pairwise distance at each iteration."""
    dists = pairwise_distance_over_time(run_records)
    plt.plot(dists.mean(axis=1))
    num_iters, num_pairs = dists.shape
    for pair in range(num_pairs):
        plt.scatter(range(num_iters), dists[:, pair],
                    linewidths=0, alpha=0.5)
distances = pairwise_distance_over_time(run_records)
plt.plot(np.mean(distances,1))
distances.shape
#lin_dist = distances.reshape(np.prod(distances.shape))
lin_dist=np.zeros(np.prod(distances.shape))
for i in range(len(distances.T)):
lin_dist[i*distances.shape[0]:(i+1)*distances.shape[0]] = distances[:,i]
iter_num = np.hstack((range(len(distances)) for _ in range(distances.shape[1])))
plt.scatter(iter_num[::100],lin_dist[::100],
linewidth=0)
plt.scatter(iter_num[iter_num<2000][::15],lin_dist[iter_num<2000][::15],
linewidth=0,alpha=0.005);
dist_vecs = [distance.pdist(run[::10]) for run in run_records]
np.array(dist_vecs).shape
avg_dist_vec = np.mean(np.array(dist_vecs),0)
avg_pdist = distance.squareform(avg_dist_vec)
for i,dist_vec in enumerate(dist_vecs):
plt.imshow(distance.squareform(dist_vec),
interpolation='none',
cmap='Blues')
plt.savefig('distmat {0}.jpg'.format(i),dpi=600)
# Which examples are most typical, and which examples are least typical?
plt.imshow(avg_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
std_dist_vec = np.std(np.array(dist_vecs),0)
std_pdist = distance.squareform(std_dist_vec)
plt.imshow(std_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.hist2d(iter_num[iter_num<2000],lin_dist[iter_num<2000],bins=50,
cmap='Blues');
plt.colorbar()
for i in range(distances[:,:100].shape[1]):
plt.scatter(range(distances.shape[0]),distances[:,i],
linewidths=0,alpha=0.5)
end_points = np.array([run_record[-1] for run_record in run_records])
end_points.shape
pdist = distance.pdist(end_points)
max(pdist),min(pdist)
run_records_ = np.vstack(run_records)
run_records_.shape
run_records_[::50].shape
all_dist = distance.pdist(run_records_[npr.rand(len(run_records_))<0.001])
plt.hist(all_dist,bins=100);
pca = PCA()
pca.fit(run_records_[::10])
X_ = pca.transform(run_records_)
pca.explained_variance_ratio_[:2]
c = np.hstack([i*np.ones(len(run_records[0])) for i in range(len(run_records))])
c.shape
c[::thinning].shape
thinning=50
plt.scatter(X_[::thinning,0],X_[::thinning,1],
c=c[::thinning],
alpha=0.5,
linewidths=0)
plt.xlabel('PC1')
plt.ylabel('PC2')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_[::thinning,0],X_[::thinning,1],X_[::thinning,2],
alpha=0.5,c=c[::thinning],linewidths=0)
run_record.shape
D,h,K
def l1(x, y):
    """Manhattan (L1) distance between vectors x and y."""
    return sum(abs(x - y))

def l2(x, y):
    """Euclidean (L2) distance between vectors x and y."""
    return np.sqrt(sum((x - y) ** 2))

# Distance used throughout the analysis below.
norm = l2
distance_from_start = np.array([norm(run_record[i],run_record[0]) for i in xrange(len(run_record))])
plt.rcParams['font.family'] = 'serif'
plt.plot(distance_from_start)
plt.hlines(max(distance_from_start),0,len(distance_from_start),linestyle='--')
plt.title('Distance from initial parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(run_record[i],run_record[best_index]) for i in xrange(len(run_record))])
best_index
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
plt.plot(loss_record)
plt.title('Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.ylim(0,max(loss_record))
plt.vlines(best_index,0,max(loss_record),linestyle='--')
worst_index = np.argmax(loss_record)
distance_from_worst = np.array([norm(run_record[i],run_record[worst_index]) for i in xrange(len(run_record))])
worst_index
plt.plot(distance_from_worst)
plt.vlines(worst_index,0,max(distance_from_worst),linestyle='--')
plt.title('Distance from worst parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
# evaluate training set accuracy
hidden_layer = np.maximum(0, np.dot(X, W) + b)
scores = np.dot(hidden_layer, W2) + b2
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
def accuracy(vec, X=X, y=y, D=D, h=h, K=K):
    """Classification accuracy on (X, y) of the one-hidden-layer ReLU network
    encoded by the flat parameter vector `vec`."""
    W, b, W2, b2 = vec_to_mats(vec, D, h, K)
    hidden = np.maximum(0, X.dot(W) + b)  # ReLU hidden layer
    scores = hidden.dot(W2) + b2
    preds = scores.argmax(axis=1)
    return np.mean(preds == y)
def loss(vec, X=X, y=y, D=D, h=h, K=K):
    """Mean softmax cross-entropy loss on (X, y) of the network encoded by the
    flat parameter vector `vec` (regularization deliberately disabled)."""
    W, b, W2, b2 = vec_to_mats(vec, D, h, K)
    hidden = np.maximum(0, X.dot(W) + b)  # ReLU hidden layer
    scores = hidden.dot(W2) + b2
    n = len(X)
    # softmax class probabilities, [N x K]
    exp_scores = np.exp(scores)
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    neg_log_likelihoods = -np.log(probs[range(n), y])
    data_loss = neg_log_likelihoods.sum() / n
    # reg term intentionally zero here — the line searches below treat the
    # direction only (see the commented-out expression in the original)
    reg_loss = 0
    #reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    return data_loss + reg_loss
D
run_records[0][0].shape
%timeit loss(run_records[0][0])
interp = lambda initial,final,alpha=0.5: (1-alpha)*initial + alpha*final
dim = len(end_points[0])
alpha= np.arange(-0.2,1.5,0.05)
inds = range(len(start_points))
line_losses = [[loss(interp(start_points[i],end_points[i],a)) for a in alpha] for i in inds]
line_losses = [[loss(interp(np.zeros(dim),end_points[i],a)) for a in alpha] for i in inds]
line_losses = [[loss(interp(np.zeros(dim),run_records[i][-1],a)) for a in alpha] for i in inds]
line_losses = np.array(line_losses)
line_losses.shape
np.min(line_losses)
plt.plot(line_losses[0]);
best_solution = min([loss(e) for e in end_points])
best_solution
[plt.plot(alpha,line_losses[i],alpha=0.3) for i in range(100)];
plt.hlines(best_solution,alpha[0],alpha[-1],linestyle='--',color='grey')
plt.vlines(1,0,np.max(line_losses),linestyle='--',color='grey')
plt.xlabel('alpha')
plt.ylabel('loss')
np.mean(abs(end_points))
np.mean(abs(npr.randn(1000)))
# Hmm — so what does this look like if you pick a random direction? Or take one
# good network and scramble its weight vector? Or interpolate between two good networks?
rand_end_points = npr.randn(10000,dim)
line_losses = [[loss(interp(np.zeros(dim),rand_end_points[i],alpha)) for alpha in np.arange(-1,1,0.1)] for i in range(10000)]
[plt.plot(line) for line in line_losses[:10]];
np.min(line_losses),np.max(line_losses)
np.array(line_losses).shape
rand_min_losses = np.array([np.min(l) for l in line_losses])
plt.hist(rand_min_losses,bins=50,normed=True,log=True);
plt.vlines(best_solution,0,130.0,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(np.min(rand_min_losses),0,130.0,linestyle='--',color='blue',label='Best random direction')
plt.legend(loc='best')
plt.xlabel('Loss')
plt.ylabel('Probability density')
plt.title('Line search in random directions doesn\'t work')
scrambled_end_points = [end_points[0][sorted(range(dim),key=lambda i:npr.rand())] for _ in range(100)]
np.min(line_losses)
end_points.shape
best_ind = np.argmin(np.array([loss(e) for e in end_points]))
best_ind
loss(end_points[best_ind])
line_losses = np.array([[loss(interp(np.zeros(dim),scrambled_end_points[best_ind],a)) for a in alpha] for i in inds])
plt.bar(range(len(end_points.T)),sorted(np.mean(end_points,0)));
stats.ttest_1samp(end_points[:,i],0)[0]
from scipy import stats
sig = np.zeros(end_points.shape[1])
for i in range(end_points.shape[1]):
#mean,(low,up) = stats.bayes_mvs(end_points[:,i],alpha=0.95)[0]
##sig[i] = (low*up>0)*abs(mean / (up-low))
sig[i]=abs(stats.ttest_1samp(end_points[:,i],0)[0])
plt.plot(sig)
mats=vec_to_mats(sig,D,h,K)
fig, axes = plt.subplots(nrows=2, ncols=2)
for i,ax in enumerate(axes.flat):
im = ax.imshow(mats[i],cmap='Blues')
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im,cax=cbar_ax)#, cax=cbar_ax)
# Hmm — so it looks like the biases learn consistent values across multiple runs.
end_points[:,0].shape
plt.hist([l1(np.zeros(dim),e) for e in end_points],bins=50);
plt.hist([l2(np.zeros(dim),e) for e in end_points],bins=100);
aligned_end_points = heuristic_alignment(end_points,D,h,K)
raw_distances = distance.pdist(end_points)
aligned_distances = distance.pdist(aligned_end_points)
raw_distances.mean(),aligned_distances.mean()
raw_distances.min(),aligned_distances.min()
plt.hist([l2(np.zeros(dim),18*npr.randn(dim)/14) for _ in range(1000)],bins=100);
plt.plot(line_losses[10])
# Compared with the angle from start to finish, what is the angle from start to current?
start_to_f = [e - i for (e,i) in zip(end_points,start_points)]
e = end_points[0]
e.dot(-e)/(np.sqrt(sum(e**2))*np.sqrt(sum(e**2)))
def angle(x, y):
    """Cosine similarity between vectors x and y (dot product over norms)."""
    return x.dot(y) / (np.sqrt(sum(x ** 2)) * np.sqrt(sum(y ** 2)))
r = run_records[0]
plt.plot(angle(start_to_f[0],r.T))
[plt.plot(angle(start_to_f[i],run_records[i].T),alpha=0.5) for i in range(100)];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity w.r.t. final')
angle(end_points[40],end_points[41])
run_records[0].shape
run_records[0][:,:-lag].dot(run_records[0][:,:-lag].T).shape
def lag_angle(record, lag=10):
    """Cosine similarity between each parameter vector and the vector `lag`
    iterations later.

    Parameters
    ----------
    record : (num_iter, num_params) array of parameter vectors over training.
    lag : int, iteration offset.

    Returns
    -------
    (num_iter - lag,) array where entry i is the cosine similarity between
    record[i] and record[i + lag].

    Improvement: vectorized — replaces the per-step Python loop (and the
    dependency on the global `angle` helper) with a single numpy expression
    computing the identical dot / (|x| * |y|) formula.
    """
    record = np.asarray(record)
    n = len(record) - lag
    early, late = record[:n], record[lag:lag + n]
    dots = np.sum(early * late, axis=1)
    norms = np.sqrt(np.sum(early ** 2, axis=1)) * np.sqrt(np.sum(late ** 2, axis=1))
    return dots / norms
run_records[:10][0].shape
lag=200
[plt.plot(lag_angle(r[:1200],lag),alpha=0.5) for r in run_records];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity at lag {0}'.format(lag))
cosine_distances = distance.pdist(end_points,'cosine')-1
distance.squareform(cosine_distances).shape
np.mean(cosine_distances),np.min(cosine_distances),np.max(cosine_distances)
distance.cosine(np.ones(200),np.ones(200))
plt.imshow(distance.squareform(distance.pdist(r[::10],'cosine')),cmap='Blues')
# How sensitive is this result to the angle? i.e. \(\left|\frac{\partial \min J}{\partial \theta}\right|\)?
# It is now convenient to define a loss function in terms of just the direction,
# since the minimum along a direction can be found easily using a coarse line search.
# "Line loss":
min_line_loss = lambda vec,loss=loss,alpha=np.arange(0.0,1.5,0.075):np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
loss_as_f_alpha = lambda vec,loss=loss:lambda alpha:loss(alpha*vec)
from scipy import optimize
direction = end_points[best_ind]
loss_in_best_dir = loss_as_f_alpha(direction)
optimize.fmin(loss_as_f_alpha(direction),1.0)
t = time()
optimize.fmin(loss_in_best_dir,1.0,maxfun=30,disp=0)
print(time() - t)
%timeit min_line_loss(npr.randn(dim))
min_line_loss(end_points[best_ind])
def gradient(func, x0, h=0.001):
    """Forward-difference numerical gradient of scalar function `func` at x0.

    Parameters
    ----------
    func : callable mapping a 1-D array to a scalar.
    x0 : array-like point at which to differentiate.
    h : float, finite-difference step.

    Returns
    -------
    (len(x0),) array of one-sided difference quotients.

    Bug fix: cast x0 to float explicitly (the original had `dtype=float`
    commented out) — with an integer input array the perturbation
    `x[i] += h` was truncated to zero, silently returning a zero gradient.
    """
    x0 = np.array(x0, dtype=float)
    y0 = func(x0)
    deriv = np.zeros(len(x0))
    for i in range(len(x0)):
        x = x0.copy()
        x[i] += h  # perturb one coordinate at a time
        deriv[i] = (func(x) - y0) / h
    return deriv
%timeit gradient(min_line_loss,end_points[best_ind])
new_dir = end_points[best_ind] - gradient(min_line_loss,end_points[best_ind])
min_line_loss_scipy = lambda vec,loss=loss:optimize.fmin(loss_as_f_alpha(vec),1.0,maxfun=30,disp=0)
gradient(min_line_loss_scipy,end_points[best_ind])[:10]
min_line_loss(new_dir)
npr.seed(0)
new_dir = npr.randn(dim)
directions = [new_dir]
losses = [min_line_loss(new_dir)]
for i in range(10):
new_dir -= 100*gradient(min_line_loss,new_dir)
directions.append(new_dir)
losses.append(min_line_loss(new_dir))
print(losses[-1])
sum(abs(new_dir))
def adaptive_metropolis_hastings(func, x0, num_steps=1000, step_size=0.1, verbose=False):
    """Greedy hill-climbing with an adaptively scaled Gaussian proposal.

    Despite the name, a proposal is accepted only when it does not increase
    `func` (the stochastic Metropolis acceptance rule is commented out).
    After more than 10 consecutive rejections the step size is shrunk
    (floored near 1e-3); after more than 10 consecutive acceptances it is
    grown (capped near 1.0). Progress is printed every 100 steps.

    Returns (locs, vals, steps): visited points, their func values, and the
    step size in effect at each step.
    """
    dim = len(x0)
    locs = np.zeros((num_steps + 1, dim))
    vals = np.zeros(num_steps + 1)
    steps = np.zeros(num_steps + 1)
    locs[0] = x0
    vals[0] = func(x0)
    steps[0] = step_size
    adaptation_rate = 1.02
    num_rejects = 0
    num_accepts = 0
    message = ''
    for step in range(num_steps):
        proposal = locs[step] + npr.randn(dim) * step_size
        proposal_val = func(proposal)
        if proposal_val <= vals[step]:
            # accepted: reset the rejection streak, maybe grow the step
            num_rejects = 0
            num_accepts += 1
            if num_accepts > 10:
                if step_size < 1.0:
                    step_size *= adaptation_rate
                    message = message + '\n increased step size'
            locs[step + 1] = proposal
            vals[step + 1] = proposal_val
        else:
            # rejected: stay put, reset the acceptance streak, maybe shrink
            num_rejects += 1
            num_accepts = 0
            if num_rejects > 10:
                if step_size > 1e-3:
                    step_size /= adaptation_rate
                    message = message + '\n reduced step size'
            locs[step + 1] = locs[step]
            vals[step + 1] = vals[step]
        if step % 100 == 0:
            print("{0}: {1:.3}".format(step, vals[step + 1]))
            if verbose:
                print(message)
                message = ''
        steps[step + 1] = step_size
    return locs, vals, steps
np.arange(0.0,1.5,0.1).shape
def min_line_loss(vec, loss=loss, alpha=np.arange(0.0, 1.5, 0.1)):
    """Coarse line search: smallest loss found along the ray alpha * vec,
    evaluated at the grid points in `alpha`."""
    origin = np.zeros(len(vec))
    candidates = [loss(interp(origin, vec, a)) for a in alpha]
    return np.min(candidates)
locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind],num_steps=2000,step_size=0.01)
#+npr.randn(dim)*0.01
plt.plot(vals)
plt.hlines(vals[0],0,len(vals),linestyle='--',color='grey')
plt.ylabel('loss')
plt.xlabel('Hill-climbing step')
plt.plot(steps)
np.mean(abs(end_points[best_ind]))
np.mean(abs(npr.randn(dim)))
random_directions=npr.randn(100,dim)
best_rand = np.argmin([min_line_loss(r) for r in random_directions])
multiple_hc_runs_near = []
for i in range(10):
locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+npr.randn(dim)*0.5,num_steps=1000,step_size=0.01)
multiple_hc_runs_near.append((locs,vals,steps))
#locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+0.5*npr.randn(dim),num_steps=5000,step_size=0.01)
# random initialization
for soln in multiple_hc_runs[:-1]:
vals = soln[1]
plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Random initialization')
for soln in multiple_hc_runs_near[:-1]:
vals = soln[1]
plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs_near[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Initialized near SGD solution')
hc_solns = np.array([run[0][-1] for run in multiple_hc_runs_near])
hc_angles = []
for i in range(len(hc_solns)):
for j in range(i):
if i!=j:
hc_angles.append(angle(hc_solns[i],hc_solns[j]))
hc_angles = np.array(hc_angles)
plt.hist(hc_angles,bins=20)
plt.plot(steps)
sum(abs(gradient(min_line_loss,new_dir)))
plt.plot(sorted(gradient(min_line_loss,end_points[best_ind])))
from scipy.spatial import distance
pdist = distance.pdist(run_record)
dist_mat = distance.squareform(pdist)
from matplotlib import colors
ln = colors.LogNorm()
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-3h-3-class-normed.jpg',dpi=600)
def pairwise_distances(X):
    """Dense matrix of BC_jit distances between every pair of rows of X.

    Bug fix: the original allocated `pdist` but then wrote into and returned
    an undefined name `pbc`, so the function raised NameError on first call.
    """
    dist_mat = np.zeros((len(X), len(X)))
    for i in range(len(X)):
        for j in range(len(X)):
            # NOTE(review): BC_jit is defined elsewhere in the project;
            # assumed to be a distance on two 1-D vectors — confirm.
            dist_mat[i, j] = BC_jit(X[i], X[j])
    return dist_mat
W,b,W2,b2 = vec_to_mats(run_record[-1],D,h,K)
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.maximum(0, np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b), W2) + b2
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Greys, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y,linewidths=0)#, s=40)#, cmap=plt.cm.Greys)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_net.png')
in_dim = 2
hidden_dim = 10
out_dim = 2
W1 = npr.randn(in_dim,hidden_dim)
W2 = npr.randn(hidden_dim,out_dim)
npr.randn(2).dot(W1)
from scipy.misc import comb
np.prod(range(1,20))
np.prod(W1.shape) + np.prod(W2.shape)
parameter_vector = np.hstack((W1.reshape(np.prod(W1.shape)),W2.reshape(np.prod(W2.shape))))
def mats_to_vec(W, b, W2, b2):
    """Flatten the four parameter matrices into one 1-D vector, in the fixed
    order W, b, W2, b2 (row-major within each matrix)."""
    return np.concatenate([W.ravel(), b.ravel(), W2.ravel(), b2.ravel()])
def vec_to_mats(param_vector, D, h, K):
    """Inverse of mats_to_vec: split a flat parameter vector into (W, b, W2, b2).

    D is input dimensionality, h is number of hidden units, K is number of
    output units.
    """
    sizes = (D * h, h, h * K, K)
    shapes = ((D, h), (1, h), (h, K), (1, K))
    mats = []
    offset = 0
    for size, shape in zip(sizes, shapes):
        mats.append(param_vector[offset:offset + size].reshape(shape))
        offset += size
    W, b, W2, b2 = mats
    return W, b, W2, b2
mats = vec_to_mats(run_record[0],D,h,K)
W1,b1,W2,b2 = mats
W1.shape,b1.shape,W2.shape,b2.shape
def find_permutation(mats):
    """Simple heuristic hidden-unit ordering: sort units ascending by the
    column-wise maximum of W1.

    (The commented-out alternatives in the original — sorting by b1, by
    max(W1)+max(W2), or by max(W2) — are other heuristics that were tried.)
    """
    W1, b1, W2, b2 = mats
    # stable argsort reproduces sorted(range(...), key=...) exactly, ties included
    return np.argsort(W1.max(axis=0), kind='stable').tolist()
perm = find_permutation(mats)
def permuted_vector(perm, W1, b1, W2, b2):
    """Flatten the parameters with hidden units reordered by `perm`.

    Permutes the columns of W1 and b1 and the rows of W2 (b2 is untouched),
    which relabels hidden units without changing the represented function.

    Cleanup: removed the unused local bindings D, h, K from the original.
    """
    return mats_to_vec(W1[:, perm], b1[:, perm], W2[perm, :], b2)
def heuristic_alignment(record, D, h, K):
    """Realign every parameter vector in `record` using the per-row
    find_permutation heuristic, returning the aligned record."""
    aligned = np.zeros(record.shape)
    for idx in range(len(record)):
        mats = vec_to_mats(record[idx], D, h, K)
        aligned[idx] = permuted_vector(find_permutation(mats), *mats)
    return aligned
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(run_record)
plt.plot(pca.explained_variance_ratio_[:20])
plt.plot(pca.transform(run_record)[:,0],label='PC1',color='blue',
linewidth=3)
plt.plot(pca.transform(run_record)[:,1],label='PC2',color='blue',
linewidth=2,alpha=0.7)
plt.plot(pca.transform(run_record)[:,2],label='PC3',color='blue',
alpha=0.5)
plt.plot(pca.transform(run_record)[:,3],label='PC4',color='blue',
linestyle='--',alpha=0.5)
plt.plot(pca.transform(run_record)[:,4],label='PC5',color='blue',
linestyle='--',alpha=0.25)
plt.plot(pca.transform(run_record)[:,5],label='PC6+',color='grey',
linestyle='--',alpha=0.1)
for i in range(6,10):
plt.plot(pca.transform(run_record)[:,i],color='grey',
linestyle='--',alpha=0.1)
plt.xlabel('Iteration #')
plt.legend(loc='best')
plt.title('Training dynamics projected onto leading principal components')
reduced_dynamics = pca.transform(run_record)[:,:10]
sum(pca.explained_variance_ratio_[:3])
run_record.shape
plt.plot(abs(pca.components_[1]))
plt.plot(sorted(abs(pca.components_[0])))
sum(abs(pca.components_[0])>0.05)
plt.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],pca.transform(run_record)[:,2],
c=range(num_iter),linewidths=0,alpha=0.5)
#plt.colorbar()
D=2
aligned_record = heuristic_alignment(run_record,D,h,K)
from scipy.spatial import distance
pdist = distance.pdist(aligned_record)
dist_mat = distance.squareform(pdist)
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-sorted-3h-3-class.jpg',dpi=600)
h=30
perms = [find_permutation(vec_to_mats(vec,D,h,K)) for vec in run_record]
def find_most_common_perm(record, D, h, K):
    """Return (as a list) the hidden-unit permutation chosen most often by
    find_permutation across the parameter vectors in `record`.

    Bug fix: the original computed the permutations but never returned
    anything — its trailing `len(...)` expressions and std plot were dead
    code, so the function always returned None.
    """
    perms = [tuple(find_permutation(vec_to_mats(vec, D, h, K))) for vec in record]
    most_common = max(set(perms), key=perms.count)
    return list(most_common)
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(aligned_record)
plt.plot(pca.explained_variance_ratio_[:20])
sum(pca.explained_variance_ratio_[:2])
plt.scatter(pca.transform(aligned_record)[:,0],pca.transform(aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
pca.fit(globally_aligned_record)
sum(pca.explained_variance_ratio_[:3])
plt.scatter(pca.transform(globally_aligned_record)[:,0],pca.transform(globally_aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
perm_set_list = list(set([tuple(p) for p in perms]))
mapping = dict(zip(list(set([tuple(p) for p in perms])),range(len(perm_set_list))))
mapped = np.array([mapping[tuple(p)] for p in perms])
res = plt.hist(mapped,bins=83);
most_freq_perm = perms[mapped==np.argmax(res[0])][0]
most_freq_perm = perms[mapped==66][0]
most_freq_perm
globally_aligned_record = np.zeros(run_record.shape)
for i in range(len(run_record)):
mats = vec_to_mats(run_record[i],D,h,K)
globally_aligned_record[i] = permuted_vector(most_freq_perm,*mats)
from scipy.spatial import distance
pdist = distance.pdist(globally_aligned_record)#[::10])
dist_mat = distance.squareform(pdist)
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-global-sorted-30h.jpg',dpi=600)
plt.plot(mapped==66)
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(aligned_record[i],aligned_record[best_index]) for i in xrange(len(run_record))])
best_index
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Adjusted distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
# X =   (dangling fragment from the notebook — left incomplete; commented out so the file parses)
# can we reduce this to a sorting problem?
sorted(range(len(W2)),key=lambda i:np.mean(W1[:,i]))
W1.shape
# greedy alignment
# let's try this for autoencoders!
# Train a one-hidden-layer ReLU autoencoder on X with minibatch SGD.
# BUG FIX(review): the original kept the softmax-classifier gradient code here
# (dscores = probs; dscores[range(n), y] -= 1) — `probs` was a stale global
# from the classifier experiments and that gradient belongs to cross-entropy,
# not to this autoencoder's reconstruction loss. The backward pass below
# differentiates the actual loss, sqrt(sum((outputs - X_)**2)) / num_examples.
# Also Py2 `xrange`/`print` replaced with Py2/Py3-compatible forms.
# initialize parameters randomly
h = 100  # size of hidden layer
W = 0.01 * np.random.randn(D, h)
b = np.zeros((1, h))
W2 = 0.01 * np.random.randn(h, D)  # decoder maps back to the input dimension
b2 = np.zeros((1, D))
# some hyperparameters
step_size = 1e-0
reg = 1e-3  # regularization strength
# keep a record of training progress
params = np.hstack((W.ravel(), b.ravel(), W2.ravel(), b2.ravel()))
num_params = len(params)
num_iter = 10000
run_record = np.zeros((num_iter, num_params))
loss_record = np.zeros(num_iter)
# stochastic gradient descent loop
mini_batch_size = 20
for i in range(num_iter):
    # minibatch: Bernoulli mask with expected size mini_batch_size
    if mini_batch_size < len(X):
        mask = npr.rand(len(X)) < (1.0 * mini_batch_size / len(X))
        X_ = X[mask]
    else:
        X_ = X[:]
    # full-data reconstruction loss, recorded for monitoring
    hidden_layer = np.maximum(0, np.dot(X, W) + b)  # ReLU activation
    outputs = np.dot(hidden_layer, W2) + b2
    data_loss = np.sqrt(np.sum(abs(outputs - X) ** 2)) / len(X)
    reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
    loss = data_loss + reg_loss
    if i % 1000 == 0:
        print("iteration %d: loss %f" % (i, loss))
    loss_record[i] = loss
    # minibatch forward pass for the gradient
    num_examples = X_.shape[0]
    hidden_layer = np.maximum(0, np.dot(X_, W) + b)  # ReLU activation
    outputs = np.dot(hidden_layer, W2) + b2
    # gradient of sqrt(sum(diff**2)) / num_examples w.r.t. outputs is
    # diff / (num_examples * ||diff||_F); guard against a zero residual
    diff = outputs - X_
    frob = np.sqrt(np.sum(diff ** 2))
    doutputs = diff / (num_examples * max(frob, 1e-12))
    # backpropagate: first into W2 and b2 ...
    dW2 = np.dot(hidden_layer.T, doutputs)
    db2 = np.sum(doutputs, axis=0, keepdims=True)
    # ... then into the hidden layer, gated by the ReLU ...
    dhidden = np.dot(doutputs, W2.T)
    dhidden[hidden_layer <= 0] = 0
    # ... finally into W and b
    dW = np.dot(X_.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)
    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W
    # parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2
    run_record[i] = np.hstack((W.ravel(), b.ravel(), W2.ravel(), b2.ravel()))